Importing the libraries¶

In [3]:
# Importing the libraries 
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os
from pyspark.sql.functions import length
from pyspark.sql import functions as F

Exploratory Data Analysis¶

Read Data from Spark¶

Data Cleaning¶

In [4]:
datastore = 'azureml://datastores/workspaceblobstore/paths/'
submissions_path = 'filtered-submissions'
submissions_df = spark.read.parquet(f"{datastore}{submissions_path}")

# take a subset of columns
df = submissions_df.select("subreddit", "author", "title", "selftext",
                             "created_utc", "num_comments", "score", 
                             "over_18", "media", "pinned", "locked", 
                             "disable_comments", "domain", "hidden", 
                             "distinguished", "hide_score")

# calculate post length
df = df.withColumn('post_length', length(df.title) + length(df.selftext))


df = df.withColumn('created_utc', F.to_timestamp('created_utc'))

# Extract time-based features
df = df.withColumn('hour_of_day', F.hour('created_utc'))
df = df.withColumn('day_of_week', F.dayofweek('created_utc'))  # 1 (Sunday) to 7 (Saturday)
# Map each day of the week from numeric to string
df = df.withColumn('day_of_week_str', F.expr("""
    CASE day_of_week 
        WHEN 1 THEN 'Sunday'
        WHEN 2 THEN 'Monday'
        WHEN 3 THEN 'Tuesday'
        WHEN 4 THEN 'Wednesday'
        WHEN 5 THEN 'Thursday'
        WHEN 6 THEN 'Friday'
        WHEN 7 THEN 'Saturday'
    END
"""))
df = df.withColumn('day_of_month', F.dayofmonth('created_utc'))
df = df.withColumn('month', F.month('created_utc'))
df = df.withColumn('year', F.year('created_utc'))

df = df.withColumn('has_media', F.col('media').isNotNull())

df = df.drop(*["media", "disable_comments", "distinguished"])
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 108, 9, Finished, Available)
In [3]:
PLOT_DIR = os.path.join("Users/sk2224/fall-2023-reddit-project-team-34/data", "plots")
CSV_DIR = os.path.join("Users/sk2224/fall-2023-reddit-project-team-34/data", "csv")
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 105, 8, Finished, Available)

Scatterplot for Engagement metrics for Reddit posts:¶

In [4]:
df_plotly = df.select(["subreddit", "num_comments", "score", "has_media", "post_length"])
df_plotly = df_plotly.filter(df_plotly.subreddit.isin('movies', 'anime', 'television'))
df_plotly_pd = df_plotly.toPandas()

color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}

# Create the scatter plot with custom colors
fig = px.scatter(
    df_plotly_pd, 
    y='post_length', 
    x='score', 
    color='subreddit',
    color_discrete_map=color_map,  # Use the custom color map
    size='num_comments', 
    labels={'num_comments': 'Number of Comments', 'score': 'Score', 
            'subreddit': 'Subreddit', 'post_length': 'Post Length'},
    title='Engagement Dynamics of Reddit Posts Across Entertainment Subreddits'
)

# Update layout and axis limits
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
fig.update_xaxes(range=[0, 50000])
fig.update_yaxes(range=[0, 8000])

# Show the plot
fig.show()

fig.write_html(f"{PLOT_DIR}/engagement_eda.html")
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 48, 9, Finished, Available)

Distribution of Reddits posts over time from 2021¶

Count and Average Score of posts from 2021-2023¶

In [24]:
df_datetime_pd = pd.read_csv("../../data/csv/year_month_day_eda.csv")
df_datetime_pd.head()
Out[24]:
subreddit day month year count
0 movies 19 11 2022 320
1 television 16 6 2021 113
2 anime 31 3 2021 766
3 television 18 5 2021 135
4 television 19 7 2022 132
In [4]:
df_datetime_avg_score_pd = pd.read_csv("../../data/csv/year_month_day_avgscore_eda.csv")
df_datetime_avg_score_pd.head()
Out[4]:
subreddit year month average_score
0 movies 2021 5 92.467274
1 movies 2021 10 153.312225
2 television 2021 3 326.358652
3 television 2021 6 278.586502
4 anime 2021 9 71.181420
In [25]:
df_datetime_pd_ym = df_datetime_pd.groupby(["month", "year", "subreddit"], as_index=False)["count"].sum()
# Convert year, month, and day_of_month to a datetime column in Pandas
df_datetime_pd_ym['date'] = pd.to_datetime(df_datetime_pd_ym[['year', 'month']].assign(day=1))
df_datetime_pd_ym = df_datetime_pd_ym.sort_values(by="date")
df_datetime_pd_ym.head()
Out[25]:
month year subreddit count date
0 1 2021 anime 22775 2021-01-01
1 1 2021 movies 15673 2021-01-01
2 1 2021 television 3513 2021-01-01
11 2 2021 television 3629 2021-02-01
10 2 2021 movies 15617 2021-02-01
In [9]:
df_datetime_avg_score_pd['date'] = pd.to_datetime(df_datetime_avg_score_pd[['year', 'month']].assign(day=1))
df_datetime_avg_score_pd = df_datetime_avg_score_pd.sort_values(by="date")
df_datetime_avg_score_pd.head()
Out[9]:
subreddit year month average_score date
20 movies 2021 1 169.982262 2021-01-01
26 anime 2021 1 66.665379 2021-01-01
23 television 2021 1 376.816112 2021-01-01
24 anime 2021 2 67.384173 2021-02-01
21 television 2021 2 326.005511 2021-02-01
In [26]:
# Define the division factors in a dictionary
divisors = {
    'movies': 382085,
    'anime': 404298,
    'television': 89586
}

# Function to apply the custom division based on the subreddit
def custom_divide(row):
    return row['count'] / divisors[row['subreddit']]

# Apply the function to each row
df_datetime_pd_ym['normalized_count'] = df_datetime_pd_ym.apply(custom_divide, axis=1)
In [27]:
# Custom color map
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}

# Create the time series plot using Plotly
fig = px.line(
    df_datetime_pd_ym,
    x='date',
    y='normalized_count',
    color='subreddit',
    color_discrete_map=color_map,  # Use the custom color map
    labels={'count': 'Post Count', 'date': 'Date', 'subreddit': 'Subreddit'},
    line_shape="spline",
    title='Number of posts across the years (2021-2023)',
    render_mode='svg'
)

# Improve the clarity of the plot
fig.update_traces(
    line=dict(width=2),  # Thinner line
    mode='lines+markers',  # Show markers as well as lines
    marker=dict(size=4, opacity=0.6),  # Smaller markers with some transparency
    opacity=0.7  # Lines are a bit transparent to reduce visual clutter
)

# Enhance the layout
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99
    )
)

# Add range slider for interactivity
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1M", step="month", stepmode="backward"),
                dict(count=6, label="6M", step="month", stepmode="backward"),
                dict(count=1, label="1Y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        )
    )
)

fig.update_yaxes(title_text='Normalized Post Count <br> (Post Count / Total Count)')
fig.update_xaxes(title_text = 'Date (2021-2023)')

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    width=1000,   # Increasing width
    height=600   # Increasing height
)

# Show the plot
fig.show()

fig.write_html(f"../../data/plots/time_series_eda.html")
In [23]:
# Custom color map
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}

# Create the time series plot using Plotly
fig = px.line(
    df_datetime_avg_score_pd,
    x='date',
    y='average_score',
    color='subreddit',
    color_discrete_map=color_map,  # Use the custom color map
    labels={'average_score': 'Average Score', 'date': 'Date', 'subreddit': 'Subreddit'},
    line_shape="spline",
    title='Average score of posts across the years (2021-2023)',
    render_mode='svg'
)

# Improve the clarity of the plot
fig.update_traces(
    line=dict(width=2),  # Thinner line
    mode='lines+markers',  # Show markers as well as lines
    marker=dict(size=4, opacity=0.6),  # Smaller markers with some transparency
    opacity=0.7  # Lines are a bit transparent to reduce visual clutter
)

# Enhance the layout
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99
    )
)

# Add range slider for interactivity
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1M", step="month", stepmode="backward"),
                dict(count=6, label="6M", step="month", stepmode="backward"),
                dict(count=1, label="1Y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        )
    )
)

fig.update_yaxes(title_text='Average Score')
fig.update_xaxes(title_text = 'Date (2021-2023)')

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    width=1000,   # Increasing width
    height=600   # Increasing height
)

# Show the plot
fig.show()

fig.write_html(f"../../data/plots/time_series_score_eda.html")

Count and Average Score across different days of the month¶

In [37]:
day_of_month_pd = pd.read_csv(f"../../data/csv/day_of_month_avg_eda.csv")
day_of_month_count_pd = pd.read_csv("../../data/csv/daily_weekly_count_eda.csv")
In [38]:
day_of_month_pd = day_of_month_pd.sort_values(by='day_of_month')
day_of_month_count_pd = day_of_month_count_pd.sort_values(by='day_of_month')
In [39]:
day_of_month_pd_1 = day_of_month_pd[day_of_month_pd["subreddit"].isin(["anime", "movies", "television"])]
day_of_month_pd_2 = day_of_month_pd[day_of_month_pd["subreddit"].isin(["Animesuggest", "televisionsuggestions", "MovieSuggestions"])]
In [40]:
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}

# Create the line chart 
fig = px.line(
    day_of_month_pd_1, 
    x='day_of_month', 
    y='average_score', 
    color='subreddit',
    color_discrete_map=color_map,
    labels={'average_score': 'Average Score', 'day_of_month': 'Day of the Month'},
    title='Average Score by Day of Month Across Subreddits',
    line_shape="spline",
)

fig.update_xaxes(range=[1, 31])  
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    width=800,   # Increasing width
    height=600   # Increasing height
)

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    width=1000,   # Increasing width
    height=600   # Increasing height
)

# Show the plot
fig.show()

fig.write_html(f"../../data/plots/avg_score_eda.html")
In [42]:
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}

# Create the line chart 
fig = px.line(
    day_of_month_count_pd, 
    x='day_of_month', 
    y='count', 
    color='subreddit',
    color_discrete_map=color_map,
    labels={'average_score': 'Average Score', 'day_of_month': 'Day of the Month'},
    title='Count of posts by Day of Month Across Subreddits',
    line_shape="spline",
)

fig.update_xaxes(range=[1, 31])  
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    width=800,   # Increasing width
    height=600   # Increasing height
)

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    width=1000,   # Increasing width
    height=600   # Increasing height
)

# Show the plot
fig.show()

fig.write_html(f"../../data/plots/day_of_month_count_eda.html")

Count and Average Score of posts across different days of week and hours of day¶

In [4]:
df_daily_weekly = df.groupBy(["day_of_week_str", "hour_of_day", "subreddit"]).count().toPandas()
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 64, 9, Finished, Available)
In [10]:
df_daily_weekly.to_csv(f"{CSV_DIR}/daily_weekly_eda.csv", index=False)
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 64, 15, Finished, Available)
In [33]:
df_daily_weekly = pd.read_csv(f"../../data/csv/daily_weekly_eda.csv")
df_daily_weekly.head()
Out[33]:
day_of_week_str hour_of_day subreddit count
0 Friday 8 anime 1768
1 Friday 14 television 791
2 Tuesday 15 movies 2932
3 Monday 16 movies 3206
4 Monday 18 television 900
In [16]:
df_daily_weekly_avgscore = pd.read_csv("../../data/csv/daily_weekly_avgscore_eda.csv")
df_daily_weekly_avgscore = df_daily_weekly_avgscore.sort_values(by="hour_of_day")
df_daily_weekly_avgscore.head()
Out[16]:
subreddit hour_of_day day_of_week_str average_score
373 movies 0 Thursday 102.478999
405 movies 0 Sunday 99.315640
312 anime 0 Friday 51.029256
277 anime 0 Monday 19.653784
109 television 0 Saturday 215.709350
In [34]:
df_daily_weekly['normalized_count'] = df_daily_weekly.apply(custom_divide, axis=1)
In [35]:
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}
fig = px.sunburst(df_daily_weekly, 
                path=['subreddit', 'day_of_week_str', 'hour_of_day'], 
                values='count', 
                color='subreddit',
                color_discrete_map=color_map,
                title="Distribution of posts across Days of Week and Hours of Day")

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    width=800,   # Increasing width
    height=600   # Increasing height
)

fig.show()
fig.write_html(f"../../data/plots/sunburst_eda.html")
In [36]:
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}
fig = px.sunburst(df_daily_weekly_avgscore, 
                path=['subreddit', 'day_of_week_str', 'hour_of_day'], 
                values='average_score', 
                color='subreddit',
                color_discrete_map=color_map,
                title="Average scores of posts across Days of Week and Hours of Day")

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    width=800,   # Increasing width
    height=600   # Increasing height
)

fig.show()
fig.write_html(f"../../data/plots/sunburst_avgscore_eda.html")

Analysis of post counts for Authors with highest score¶

In [77]:
df_top_posts_scores = pd.read_csv(f"../../data/csv/top_author_score_postcount_eda.csv")
df_top_posts_scores.head()
Out[77]:
Unnamed: 0 subreddit title num_comments selftext author score rank count
0 0 anime "Berserk" creator Kentaro Miura dead at 54 1762 NaN enterthedragonpunch 33384 1 1
1 1 anime Who will be the first seed in Best Girl 8? 619 Hi everyone, we are currently trialing a new f... mpp00 31830 2 241
2 2 anime Best Girl 9 Prediction Tournament! 264 NaN mpp00 30302 3 241
3 3 anime The Devil is a Part-Timer Season 2 Announced! 2486 NaN Srikkk 30213 4 32
4 4 anime "Spice and Wolf" New Anime Announced 1897 NaN dorkmax_executives 29222 5 304
In [79]:
df_top_posts_scores_post_count = pd.read_csv(f"../../data/csv/top_author_score_postcount_eda.csv")
df_top_posts_scores_post_count.head()
Out[79]:
Unnamed: 0 subreddit title num_comments selftext author score rank count
0 0 anime "Berserk" creator Kentaro Miura dead at 54 1762 NaN enterthedragonpunch 33384 1 1
1 1 anime Who will be the first seed in Best Girl 8? 619 Hi everyone, we are currently trialing a new f... mpp00 31830 2 241
2 2 anime Best Girl 9 Prediction Tournament! 264 NaN mpp00 30302 3 241
3 3 anime The Devil is a Part-Timer Season 2 Announced! 2486 NaN Srikkk 30213 4 32
4 4 anime "Spice and Wolf" New Anime Announced 1897 NaN dorkmax_executives 29222 5 304
In [81]:
movie_author_df = df_top_posts_scores_post_count[df_top_posts_scores_post_count['subreddit'] == 'movies'].sort_values(by="count", ascending=False)[["author", "count"]].drop_duplicates()
tv_author_df = df_top_posts_scores_post_count[df_top_posts_scores_post_count['subreddit'] == 'television'].sort_values(by="count", ascending=False)[["author", "count"]].drop_duplicates()
anime_author_df = df_top_posts_scores_post_count[df_top_posts_scores_post_count['subreddit'] == 'anime'].sort_values(by="count", ascending=False)[["author", "count"]].drop_duplicates()

# Create figure
fig = go.Figure()

# Add surface trace
# Add bar trace for movie_author_df
fig.add_trace(go.Bar(x=movie_author_df['author'], y=movie_author_df['count'], name='Movies', visible=True, marker_color='#FF4301'))

# Add bar trace for tv_author_df
fig.add_trace(go.Bar(x=tv_author_df['author'], y=tv_author_df['count'], name='TV Shows', visible=False, marker_color='#ffe100'))

# Add bar trace for anime_author_df
fig.add_trace(go.Bar(x=anime_author_df['author'], y=anime_author_df['count'], name='Anime', visible=False,  marker_color='#ff9200'))

# Update layout for better visualization
fig.update_layout(
    plot_bgcolor='white',  # Set background color to white
     xaxis=dict(title_text="Author"), # Set x-axis line color
     yaxis=dict(

        title_text="Post Count",
        # range=[0, max(movie_author_df['count'].max(), tv_author_df['count'].max(), anime_author_df['count'].max()) + 1000]
    ),
    updatemenus=[
       
        dict(
            active=0,
            buttons=list([
                 dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Top 10 authors (with highest scores) for movies subreddit"}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Top 10 authors (with highest scores) for television subreddit"}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Top 10 authors (with highest scores) for anime subreddit"}]),
                # dict(label="All",
                #      method="update",
                #      args=[{"visible": [True, True, True]},
                #            {"title": "Top 10 active authors across all 3 subreddits"}]),
            ]),
            x=0.75,  # Adjusted the dropdown position to the top
            xanchor='left',  # Anchored the dropdown to the left
            y=1.25,  # Adjusted the dropdown position to the top
            yanchor='top'  # Anchored the dropdown to the top
        )
    ])

# Set title
fig.update_layout(title_text="Top 10 authors (with highest scores) for movies subreddit")
fig.show()

fig.write_html(f"../../data/plots/top10_authorscore_postcount_eda.html")

Top Authors with Top Comments¶

In [82]:
authors_with_top_comments_post_counts = pd.read_csv("../../data/csv/authors_with_top_comments_post_counts.csv")
In [84]:
movie_author_df = authors_with_top_comments_post_counts[authors_with_top_comments_post_counts['subreddit'] == 'movies'].sort_values(by="count", ascending=False)[["author", "count"]].drop_duplicates()
tv_author_df = authors_with_top_comments_post_counts[authors_with_top_comments_post_counts['subreddit'] == 'television'].sort_values(by="count", ascending=False)[["author", "count", "num_comments"]].drop_duplicates()
anime_author_df = authors_with_top_comments_post_counts[authors_with_top_comments_post_counts['subreddit'] == 'anime'].sort_values(by="count", ascending=False)[["author", "count"]].drop_duplicates()
tv_author_df = tv_author_df.nlargest(10, 'num_comments')

tv_author_df.drop(columns=['num_comments'], axis=1, inplace=True)
tv_author_df = tv_author_df.sort_values(by = "count", ascending = False)

# Create figure
fig = go.Figure()

# Add surface trace
# Add bar trace for movie_author_df
fig.add_trace(go.Bar(x=movie_author_df['author'], y=movie_author_df['count'], name='Movies', visible=True, marker_color='#FF4301'))

# Add bar trace for tv_author_df
fig.add_trace(go.Bar(x=tv_author_df['author'], y=tv_author_df['count'], name='TV Shows', visible=False, marker_color='#ffe100'))

# Add bar trace for anime_author_df
fig.add_trace(go.Bar(x=anime_author_df['author'], y=anime_author_df['count'], name='Anime', visible=False,  marker_color='#ff9200'))

# Update layout for better visualization
fig.update_layout(
    plot_bgcolor='white',  # Set background color to white
     xaxis=dict(title_text="Author"), # Set x-axis line color
     yaxis=dict(

        title_text="Post Count",
        # range=[0, max(movie_author_df['count'].max(), tv_author_df['count'].max(), anime_author_df['count'].max()) + 1000]
    ),
    updatemenus=[
       
        dict(
            active=0,
            buttons=list([
                 dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Top 10 authors (ith top comments) for movies subreddit"}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Top 10 authors (ith top comments) for television subreddit"}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Top 10 authors (ith top comments) for anime subreddit"}]),
                # dict(label="All",
                #      method="update",
                #      args=[{"visible": [True, True, True]},
                #            {"title": "Top 10 active authors across all 3 subreddits"}]),
            ]),
            x=0.85,  # Adjusted the dropdown position to the top
            xanchor='left',  # Anchored the dropdown to the left
            y=1.25,  # Adjusted the dropdown position to the top
            yanchor='top'  # Anchored the dropdown to the top
        )
    ])

# Set title
fig.update_layout(title_text="Top 10 authors (with top comments) for movies subreddit")
fig.show()

fig.write_html(f"../../data/plots/Top 10 authors (with top comments).html")

Table to display missing values in dataset¶

In [31]:
# Loading the dataset 
df = pd.read_csv("../../data/csv/num_missing_val.csv") 
df.head()
Out[31]:
Column Missing Values
0 subreddit 0
1 author 0
2 title 0
3 selftext 0
4 created_utc 0
In [32]:
#rename columns
df.rename(columns={'Column': 'Column Name'}, inplace=True)
In [88]:
# Sort the DataFrame by 'Missing Values' in descending order
df_sorted = df.sort_values(by='Missing Values', ascending=False)

fig = go.Figure(data=[go.Table(
    header=dict(values=list(df_sorted.columns),
                fill_color='#FF4301',
                font=dict(color='white'),  # Set font color for header
                align='left'),
    cells=dict(values=[df_sorted['Column Name'], df_sorted['Missing Values']],
               fill_color='lightgrey',
               align='left'))
])
fig.update_layout(title=dict(text="Distribution of missing values"))
# Export the figure to a PNG file
fig.write_html("../../data/plots/table_missing_values.html")

fig.show()

Subreddit count¶

In [34]:
df_subreddit = pd.read_csv("../../data/csv/subreddit_count.csv") 
df_subreddit
Out[34]:
Unnamed: 0 subreddit count
0 0 anime 404298
1 1 television 89586
2 2 televisionsuggestions 7991
3 3 movies 382085
4 4 Animesuggest 74101
5 5 MovieSuggestions 58907
In [35]:
df_subreddit =df_subreddit.replace("anime", "r/anime")
df_subreddit =df_subreddit.replace("television", "r/television")
df_subreddit =df_subreddit.replace("televisionsuggestions", "r/televisionsuggestions")
df_subreddit =df_subreddit.replace("movies", "r/movies")
df_subreddit =df_subreddit.replace("Animesuggest", "r/Animesuggest")
df_subreddit =df_subreddit.replace("MovieSuggestions", "r/MovieSuggestions")
In [37]:
# Reorder it based on the values:
ordered_df = df_subreddit.sort_values(by='count')
my_range=range(1,len(df_subreddit.index)+1)

# Horizontal version
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['count'], color='lightgrey')
plt.plot(ordered_df['count'], my_range, "D", markerfacecolor='#FF4301', markeredgecolor='#FF4301')
plt.yticks(my_range, ordered_df['subreddit'])
plt.gca().get_xaxis().set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{int(x / 1000)}K'))
plt.xlim(0, ordered_df['count'].max() + 5000) 

# Annotate count numbers over the diamonds, pushing them more to the left
for i, count in zip(my_range, ordered_df['count']):
    plt.annotate(f'{int(count):,}', xy=(count + 5300, i), ha='left', va='center', fontsize=10, color='black')

# Add x-axis and y-axis labels
plt.xlabel('Count', fontsize=13)  
plt.ylabel('Subreddit', fontsize=13)  

# Set background color to none
plt.gca().set_facecolor('none')

# Remove borders
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Add caption at the top
plt.text(0.5, 1.05, 'Subreddit Counts Analysis', ha='center', va='center', fontsize=16,transform=plt.gca().transAxes)

# Save the figure as a PNG file
plt.savefig("../../data/plots/subreddit_count_analysis.png", bbox_inches='tight', dpi=300)

plt.show()

Table author count¶

In [86]:
#df_subreddit = df_subreddit.drop(columns=['Unnamed: 0'])
# Sort the DataFrame by 'Missing Values' in descending order
df_subreddit_sorted = df_subreddit.sort_values(by='count', ascending=False)
In [87]:
fig = go.Figure(data=[go.Table(
    header=dict(values=list(df_subreddit_sorted.columns),
                fill_color='#FF4301',
                font=dict(color='white'),  # Set font color for header
                align='left'),
    cells=dict(values=[df_subreddit_sorted['subreddit'], df_subreddit_sorted['count']],
               fill_color='lightgrey',
               align='left'))
])

fig.update_layout(title=dict(text="Distribution of Subreddits posts"))

# Export the figure to a PNG file
fig.write_html("../../data/plots/table_subreddit_count.html")

fig.show()

Plot for authors with top post¶

In [40]:
# load dataset
author_df = pd.read_csv("../../data/csv/author_eda.csv")
author_df
Out[40]:
Unnamed: 0 author subreddit count rank
0 1 AutoLovepon anime 6982 2
1 2 Turbostrider27 anime 2215 3
2 3 Gvostfr anime 1677 4
3 4 Lovro26 anime 1450 5
4 5 inspyral anime 787 6
5 6 AnimeMod anime 755 7
6 7 SuperAlloyBerserker anime 595 8
7 8 Shimmering-Sky anime 557 9
8 9 RobotiSC anime 517 10
9 10 SorcererOfTheLake anime 441 11
10 12 wednesdaygiftinfo movies 24222 2
11 13 allthebestmovies movies 10353 3
12 14 Alternative-Bat-2458 movies 3538 4
13 15 Ornery-Control-9474 movies 3475 5
14 16 Sisiwakanamaru movies 1861 6
15 17 MovieBattleGame movies 1471 7
16 18 MarvelsGrantMan136 movies 1362 8
17 19 chanma50 movies 939 9
18 20 Samoht99 movies 842 10
19 21 indig0sixalpha movies 820 11
20 23 MarvelsGrantMan136 television 1960 2
21 24 Sisiwakanamaru television 1139 3
22 25 klutzysunshine television 966 4
23 26 chanma50 television 723 5
24 27 misana123 television 663 6
25 28 PetyrDayne television 657 7
26 29 Neo2199 television 602 8
27 30 indig0sixalpha television 564 9
28 31 GroundbreakingSet187 television 436 10
29 32 DemiFiendRSA television 403 11
In [41]:
movie_author_df = author_df[author_df['subreddit'] == 'movies']
tv_author_df = author_df[author_df['subreddit'] == 'television']
anime_author_df = author_df[author_df['subreddit'] == 'anime']
In [89]:
# Create figure
fig = go.Figure()

# Add surface trace
# Add bar trace for movie_author_df
fig.add_trace(go.Bar(x=movie_author_df['author'], y=movie_author_df['count'], name='Movies', visible=True, marker_color='#FF4301'))

# Add bar trace for tv_author_df
fig.add_trace(go.Bar(x=tv_author_df['author'], y=tv_author_df['count'], name='TV Shows', visible=False, marker_color='#ffe100'))

# Add bar trace for anime_author_df
fig.add_trace(go.Bar(x=anime_author_df['author'], y=anime_author_df['count'], name='Anime', visible=False,  marker_color='#ff9200'))

# Update layout for better visualization
fig.update_layout(
    plot_bgcolor='white',  # Set background color to white
     xaxis=dict(title_text="Author"), # Set x-axis line color
     yaxis=dict(
         title_text="Post Count",
    ),
    updatemenus=[
       
        dict(
            active=0,
            buttons=list([
                 dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Top 10 active authors for movies subreddit"}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Top 10 active authors for television subreddit"}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Top 10 active authors for anime subreddit"}]),
               
            ]),
            x=0.75,  # Adjusted the dropdown position to the top
            xanchor='left',  # Anchored the dropdown to the left
            y=1.25,  # Adjusted the dropdown position to the top
            yanchor='top'  # Anchored the dropdown to the top
        )
    ])

# Set title
fig.update_layout(title_text="Top 10 active authors for movies subreddit")
fig.write_html("../../data/plots/Top_author_post.html")
fig.show()

Table for posts with top comments¶

In [43]:
#import data
anime_comments_df = pd.read_csv("../../data/csv/top_comments_anime.csv")
movies_comments_df = pd.read_csv("../../data/csv/top_comments_movies.csv")
tvshows_comments_df = pd.read_csv("../../data/csv/top_comments_tv_show.csv")
In [44]:
#remove rows with [deleted] author
movies_comments_df = movies_comments_df[movies_comments_df['author'] != '[deleted]']
# creating a new column content
movies_comments_df["selftext"].fillna(" ", inplace=True)
movies_comments_df["Content"] = "Title: " + movies_comments_df["title"] + "Body: " + movies_comments_df["selftext"]
#Select columns and display top 10 records
selected_columns = ["Content", "num_comments","author"]
movies_comments_df = movies_comments_df[selected_columns].head(10)
movies_comments_df
Out[44]:
Content num_comments author
0 Title: Name a single movie, where the sequel o... 35446 dpemerson76
1 Title: Hi, I’m Keanu Reeves, AMABody: 33376 lionsgate
2 Title: Official Discussion - Zack Snyder's Jus... 30350 LiteraryBoner
3 Title: 1 Teen Dead, Another Critically Injured... 28664 prsnreddit
4 Title: Hello, I’m Nicolas Cage and welcome to ... 26670 lionsgate
5 Title: Official Oscars Discussion Thread 2022B... 22097 LiteraryBoner
6 Title: Official Discussion - Spider-Man: No Wa... 21419 LiteraryBoner
8 Title: Official Discussion - Avatar: The Way o... 19888 LiteraryBoner
9 Title: Official Oscars Discussion Thread 2023B... 18380 LiteraryBoner
10 Title: Hi, I’m Tobey Maguire, actor/executive ... 17793 officialtobeymaguire
In [45]:
#remove rows with [deleted] author
anime_comments_df = anime_comments_df[anime_comments_df['author'] != '[deleted]']
# creating a new column content
anime_comments_df["selftext"].fillna(" ", inplace=True)
anime_comments_df["Content"] = "Title: " + anime_comments_df["title"] + "Body: " + anime_comments_df["selftext"]
#Select columns and display top 10 records
selected_columns = ["Content", "num_comments","author"]
anime_comments_df = anime_comments_df[selected_columns].head(10)
anime_comments_df
Out[45]:
Content num_comments author
0 Title: Casual Discussion Fridays - Week of Mar... 18528 AnimeMod
1 Title: Casual Discussion Fridays - Week of Apr... 18074 AnimeMod
2 Title: Casual Discussion Fridays - Week of Apr... 16970 AnimeMod
3 Title: Casual Discussion Fridays - Week of Mar... 16651 AnimeMod
4 Title: Casual Discussion Fridays - Week of Jun... 16258 AutoModerator
5 Title: Casual Discussion Fridays - Week of Apr... 16229 AnimeMod
6 Title: Casual Discussion Fridays - Week of Apr... 16104 AnimeMod
7 Title: Casual Discussion Fridays - Week of Apr... 16084 AnimeMod
8 Title: Casual Discussion Fridays - Week of Sep... 15984 AutoModerator
9 Title: Casual Discussion Fridays - Week of Jan... 15277 AnimeMod
In [46]:
#remove rows with [deleted] author
tvshows_comments_df = tvshows_comments_df[tvshows_comments_df['author'] != '[deleted]']
# creating a new column content
tvshows_comments_df["selftext"].fillna(" ", inplace=True)
tvshows_comments_df["Content"] = "Title: " + tvshows_comments_df["title"] + "Body: " + tvshows_comments_df["selftext"]
#Select columns and display top 10 records
selected_columns = ["Content", "num_comments","author"]
tvshows_comments_df = tvshows_comments_df[selected_columns].head(10)
tvshows_comments_df
Out[46]:
Content num_comments author
0 Title: Will Smith Slaps Chris Rock at The Osca... 9332 Midnight_Oil_
1 Title: Dave Chappelle Lands Emmy Nomination fo... 7589 Neo2199
2 Title: ‘House of the Dragon’ Star Steve Toussa... 6660 overvivideo
3 Title: GLAAD condemns Dave Chappelle, Netflix ... 6550 LarryPeru
4 Title: Dave Chappelle Calls Kids Who Dared Cri... 5976 inthetownwhere
5 Title: ‘Cowboy Bebop’ Canceled By Netflix Afte... 5974 MarvelsGrantMan136
6 Title: What color is an elf? Or a Sea Snake? A... 5828 ewzetf
7 Title: Gina Carano Star Wars: She is No Longer... 5745 thetanhausergate
8 Title: Netflix Co-CEO Ted Sarandos Defends Dav... 5740 Neo2199
9 Title: The Last of Us - Series Premiere Discus... 5721 NicholasCajun
In [48]:
selected_columns_movies = ['Content', 'num_comments', 'author']
data_values_movies = [movies_comments_df[col].tolist() for col in selected_columns_movies]

selected_columns_tv = ['Content', 'num_comments', 'author']
data_values_tv = [tvshows_comments_df[col].tolist() for col in selected_columns_tv]

selected_columns_anime = ['Content', 'num_comments', 'author']
data_values_anime = [anime_comments_df[col].tolist() for col in selected_columns_anime]

# Specify the new header names
header_values = ['Content', 'Number of Comments', 'Author']

header_color="#FF4301"
body_color="lightgrey"
font_color = "white"
# Create figure
fig = go.Figure()

# Add surface trace
# Add bar trace for movie_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_movies, fill_color=body_color), 
                       columnwidth=[2.5, 0.25, 0.4]))

# Add bar trace for tv_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_tv, fill_color=body_color),
                       visible=False, 
                       columnwidth=[2.5, 0.25, 0.4]))

# Add bar trace for anime_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_anime, fill_color=body_color),
                       visible=False, 
                       columnwidth=[2.5, 0.25, 0.4]))

# Update layout for better visualization
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=list([
                 dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Top 10 posts having maximum comments for movies subreddit"}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Top 10 posts having maximum comments for television subreddit"}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Top 10 posts having maximum comments for anime subreddit"}]),]),
            x=0.75,  # Adjusted the dropdown position to the top
            xanchor='left',  # Anchored the dropdown to the left
            y=1.1,  # Adjusted the dropdown position to the top
            yanchor='top'  # Anchored the dropdown to the top
        )
    ],
    margin=dict(l=10, r=10, t=15, b=20),  # Adjust margins
    height=1000  # Adjust height
    )

# Set title
fig.update_layout(title_text="Top 10 Post having maximum comments for movies subreddit")

# Export the figure to a html file
fig.write_html("../../data/plots/table_top_comments.html")

fig.show()

Table for posts with top score¶

In [49]:
#import data
scores_df = pd.read_csv("../../data/csv/top_author_score_postcount_eda.csv")
scores_df.head()
Out[49]:
Unnamed: 0 subreddit title num_comments selftext author score rank count
0 0 anime "Berserk" creator Kentaro Miura dead at 54 1762 NaN enterthedragonpunch 33384 1 1
1 1 anime Who will be the first seed in Best Girl 8? 619 Hi everyone, we are currently trialing a new f... mpp00 31830 2 241
2 2 anime Best Girl 9 Prediction Tournament! 264 NaN mpp00 30302 3 241
3 3 anime The Devil is a Part-Timer Season 2 Announced! 2486 NaN Srikkk 30213 4 32
4 4 anime "Spice and Wolf" New Anime Announced 1897 NaN dorkmax_executives 29222 5 304
In [50]:
# creating a new column content
scores_df["selftext"].fillna(" ", inplace=True)
scores_df["Content"] = "Title: " + scores_df["title"] + "Body: " + scores_df["selftext"]
#Select columns and display top 10 records
selected_columns = ["Content", "score","author","subreddit"]
scores_df = scores_df[selected_columns]
scores_df.head()
Out[50]:
Content score author subreddit
0 Title: "Berserk" creator Kentaro Miura dead at... 33384 enterthedragonpunch anime
1 Title: Who will be the first seed in Best Girl... 31830 mpp00 anime
2 Title: Best Girl 9 Prediction Tournament! Body: 30302 mpp00 anime
3 Title: The Devil is a Part-Timer Season 2 Anno... 30213 Srikkk anime
4 Title: "Spice and Wolf" New Anime AnnouncedBod... 29222 dorkmax_executives anime
In [51]:
movie_score_df = scores_df[scores_df['subreddit'] == 'movies']
tv_score_df = scores_df[scores_df['subreddit'] == 'television']
anime_score_df = scores_df[scores_df['subreddit'] == 'anime']
In [52]:
selected_columns_movies = ['Content', 'score', 'author']
data_values_movies = [movie_score_df[col].tolist() for col in selected_columns_movies]

selected_columns_tv = ['Content', 'score', 'author']
data_values_tv = [tv_score_df[col].tolist() for col in selected_columns_tv]

selected_columns_anime = ['Content', 'score', 'author']
data_values_anime = [anime_score_df[col].tolist() for col in selected_columns_anime]

# Specify the new header names
header_values = ['Content', 'Score', 'Author']

header_color="#FF4301"
body_color="lightgrey"
font_color = "white"
# Create figure
fig = go.Figure()

# Add trace for movie_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_movies, fill_color=body_color), 
                       columnwidth=[2, 0.25, 0.45]))

# Add  trace for tv_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_tv, fill_color=body_color),
                       visible=False, 
                       columnwidth=[2, 0.25, 0.45]))

# Add  trace for anime_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_anime, fill_color=body_color),
                       visible=False, 
                       columnwidth=[2.5, 0.2, 0.45]))

# Update layout for better visualization
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=list([
                 dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Top 10 posts having maximum score for movies subreddit"}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Top 10 posts having maximum score for television subreddit"}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Top 10 posts having maximum score for anime subreddit"}]),]),
            x=0.75,  # Adjusted the dropdown position to the top
            xanchor='left',  # Anchored the dropdown to the left
            y=1.15,  # Adjusted the dropdown position to the top
            yanchor='top'  # Anchored the dropdown to the top
        )
    ],
    margin=dict(l=20, r=20, t=15, b=20),  # Adjust margins
    height=350  # Adjust height
    )


# Set title
fig.update_layout(title_text="Top 10 Post having maximum score for movies subreddit")

# Export the figure to a html file
fig.write_html("../../data/plots/table_top_score.html")

fig.show()

External Dataset Table¶

In [54]:
df_external_movies = pd.read_csv("../../data/csv/best_movies_netflix_ext.csv") 
df_external_movies.head()
Out[54]:
TITLE RELEASE_YEAR SCORE NUMBER_OF_VOTES DURATION MAIN_GENRE MAIN_PRODUCTION
0 David Attenborough: A Life on Our Planet 2020 9.0 31180 83 documentary GB
1 Inception 2010 8.8 2268288 148 scifi GB
2 Forrest Gump 1994 8.8 1994599 142 drama US
3 Anbe Sivam 2003 8.7 20595 160 comedy IN
4 Bo Burnham: Inside 2021 8.7 44074 87 comedy US
In [55]:
df_external_shows = pd.read_csv("../../data/csv/best_shows_netflix_ext.csv") 
df_external_movies.head()
Out[55]:
TITLE RELEASE_YEAR SCORE NUMBER_OF_VOTES DURATION MAIN_GENRE MAIN_PRODUCTION
0 David Attenborough: A Life on Our Planet 2020 9.0 31180 83 documentary GB
1 Inception 2010 8.8 2268288 148 scifi GB
2 Forrest Gump 1994 8.8 1994599 142 drama US
3 Anbe Sivam 2003 8.7 20595 160 comedy IN
4 Bo Burnham: Inside 2021 8.7 44074 87 comedy US